In [1]:
# Text
from colorama import Fore, Back, Style

# Computations
import numpy as np
import pandas as pd

# statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm

# PyStan
from pystan import StanModel

# preprocessing
from sklearn import preprocessing

# VSIualisation libraries

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## missingno
import missingno as msno

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

## arviz
import arviz as az

import warnings
warnings.filterwarnings("ignore")
Mercari Price Suggestion

In this article, we analyze and develop models using Bayesian Methods and PyStan for the Mercari Price Suggestion Challenge.

Description from Kaggle page:

Mercari, Japan’s biggest community-powered shopping app, knows this problem deeply. They’d like to offer pricing suggestions to sellers, but this is tough because their sellers are enabled to put just about anything, or any bundle of things, on Mercari's marketplace.

Table of Contents

The files consist of a list of product listings. These files are tab-delimited.

Columns Description
train_id or test_id the id of the listing
name the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage.
These removed prices are represented as (rm)
item_condition_id the condition of the items provided by the seller
category_name category of the listing
brand_name
price the price that the item was sold for. This is the target variable that you will predict. The unit is USD.
This column doesn't exist in test.tsv since that is what you will predict.
shipping 1 if shipping fee is paid by seller and 0 by buyer
item_description the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage.
These removed prices are represented as (rm)

We will model price for all categories. Our estimate of the parameter of product price can be considered a prediction. This article is inspired by A Primer on Bayesian Methods for Multilevel Modeling.

Loading the Dataset

In [2]:
Data = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep = '\t')
Data.columns = [x.title().replace('Id','ID') for x in Data.columns]
Data.rename(columns = {'Shipping':'Shipping Fee'}, inplace = True)
def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out['Size'] = Inp.shape[0]
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out

display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
Data Type Number of NaN Values Size Percentage
Brand_Name object 632682 1482535 42.68
Category_Name object 6327 1482535 0.43
Item_Condition_ID int64 0 1482535 0.00
Item_Description object 4 1482535 0.00
Name object 0 1482535 0.00
Price float64 0 1482535 0.00
Shipping Fee int64 0 1482535 0.00
Train_ID int64 0 1482535 0.00
In [3]:
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Category_Name'].agg({'count'}).\
            rename(columns = {'count':'Count'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
del Temp0

C = ['greenyellow', 'seagreen']; SC = 'DarkGreen'
fig = px.bar(Temp, y= 'Category_Name', x= 'Count', orientation='h',
             color = 'Shipping Fee', text = 'Count', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 7e4])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [int(x*10e3) for x in np.arange(0,8)]))
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [4]:
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Price'].agg({'mean'}).\
            rename(columns = {'mean':'Average Price'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
Temp = Temp.round(2)
del Temp0

C = ['MistyRose', 'FireBrick']; SC = 'DarkRed'
fig = px.bar(Temp, y= 'Category_Name', x= 'Average Price', orientation='h',
             color = 'Shipping Fee', text = 'Average Price', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 150])
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [5]:
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Price'].agg({'sum'}).\
            rename(columns = {'sum':'Total'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
Temp = Temp.round(2)
del Temp0

C = ['Azure', 'RoyalBlue']; SC = 'DarkBlue'
fig = px.bar(Temp, y= 'Category_Name', x= 'Total', orientation='h',
             color = 'Shipping Fee', text = 'Total', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 2.5e6])
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Preprocessing

In [6]:
Data.dropna(subset = ['Category_Name'], inplace = True)
# a random sample of the dataset
Data = Data.sample(frac=0.01, random_state=99)
display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
Data Type Number of NaN Values Size Percentage
Brand_Name object 6310 14762 42.74
Category_Name object 0 14762 0.00
Item_Condition_ID int64 0 14762 0.00
Item_Description object 0 14762 0.00
Name object 0 14762 0.00
Price float64 0 14762 0.00
Shipping Fee int64 0 14762 0.00
Train_ID int64 0 14762 0.00
In [7]:
Temp = Data[['Price', 'Shipping Fee']]
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 0 else 'Buyer')

C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price)')
fig['layout']['xaxis'].update(range=[0, 100])
fig['layout']['yaxis'].update(range=[0, .1])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()

le = preprocessing.LabelEncoder()
Data['Category'] = le.fit_transform(Data['Category_Name'])
Data['Price_Log'] = Data['Price'].apply(lambda x: np.log(x+0.1))

Category_df = Data.loc[Data.index.isin(Data['Category'].drop_duplicates().index),
                       ['Category_Name','Category']].sort_values(by=['Category']).reset_index(drop = True)

Temp = Data[['Price_Log', 'Shipping Fee']]
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 0 else 'Buyer')

C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price_Log', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price (Logarithm))')
fig['layout']['xaxis'].update(range=[-4, 8])
fig['layout']['yaxis'].update(range=[0, 3])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()

Modeling

In this section, we would like to do a panel data analysis in which data are often collected over time and the same individuals. Then a regression is run over these two dimensions [1].

A common panel data regression model: $$y_{it}= \alpha + \beta x_{it}+\varepsilon _{it},$$ where

  • $y$: the dependent variable,
  • $x$: the independent variable,
  • $\alpha$ and $\beta$: coefficients,
  • $i$ and $t$: indices for individuals and time,
  • $\varepsilon _{it}$: error term.
    • In Fixed effects model: $\varepsilon _{it}$ is assumed to vary non-stochastically over $i$ or $t$.
    • In Random effects model: $\varepsilon _{it}$ is assumed to vary stochastically over $i$ or $t$.

For more details, please see [1, 2].

Useful functions throughout the article:

In [8]:
def df_search(df, key): return [s for s in df.columns if key in s]
def list_search(List, key): return [s for s in List if key in s]
def fit_vars(fit):
    Vars = list(fit.extract(permuted=True).keys())
    try:
        Vars.remove('lp__')
    except:
        pass
    try:
        Vars.remove('y_hat')
    except:
        pass
    Vars = sorted(Vars)
    return Vars

LaTex = {'alpha': r'$\alpha$', 'beta': r'$\beta$', 'sigma': r'$\sigma$',
         'sigma_alpha': r'$\sigma_{\alpha}$', 'sigma_beta': r'$\sigma_{\beta}$', 'sigma_y': r'$\sigma_{y}$',
         'mu_alpha': r'$\mu_{\alpha}$', 'mu_beta': r'$\mu_{\beta}$', 'y_hat': r'$\hat{y}$'}

Conventional Approaches

Complete Pooling

In complete pooling, we combine all the information from all the categories into a single pool of data. Thus, treat all cathegoris the same, and estimate a single price level. $$y_{i}= \alpha + \beta x_{i}+\varepsilon _{i},$$

  • $y_i$: the logarithm of the price in product $i$,
  • $\alpha$: Original price across all products in the data,
  • $\beta$: Effect on measured $\log$ (price),
  • $\epsilon_i$: the error term.

However, a problem with this approach is the level might be different for different categories.

The model can be defind with the following considersations: \begin{align} \begin{cases} \alpha~\sim~N\left(0,\sigma^2\right),\\ \beta~\sim~N\left(0,\sigma^2\right),\\ \end{cases} \end{align}

where $\sigma$ has a half-Cauchy distribution. A half-Cauchy is one of the symmetric halves of the Cauchy distribution (if it is unspecified, the right half that's intended)

Cauchy Probability Density Function: $$\text{Cauchy}(y|\mu,\sigma) = \frac{1}{\pi \sigma} \ \frac{1}{1 + \left((y - \mu)/\sigma\right)^2}.$$

Moreover, let

Parameter Description
x Shipping Fee
y Price (Log)

To implement this, we use StanModel API from PyStan.

In [9]:
model_code = """data {int<lower=0> N; vector[N] x; vector[N] y;}
                parameters {real alpha; real beta; real<lower=0> sigma;} 
                model {y ~ normal(alpha + beta * x, sigma);}"""

IT = int(1e3)
# Pooled model
model = StanModel(model_code = model_code)
CP_fit = model.sampling(data = {'N': Data.shape[0], 'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values}, iter= IT, chains=2)
Extract_Samples = CP_fit.extract(permuted=True)
CP_Parm = pd.DataFrame({'alpha': Extract_Samples['alpha'], 'beta': Extract_Samples['beta']})
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_411b73098686f98516e3786c03c23d3c NOW.

Complete Pooling Parameters

In [10]:
# Figures
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

_ = ax[0].scatter(x= CP_Parm.index, y= CP_Parm.alpha, color = 'SkyBlue', edgecolors = 'Navy', label = LaTex['alpha'])
_ = ax[0].hlines(CP_Parm.alpha.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[0].set_ylim([3.06, 3.16])

_ = ax[1].scatter(x= CP_Parm.index, y= CP_Parm.beta, color = 'LimeGreen', edgecolors = 'DarkGreen', label= LaTex['beta'])
_ = ax[1].hlines(CP_Parm.beta.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'Navy', lw = 3, label = LaTex['mu_beta'])
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[1].set_ylim([-.48, -.36])
In [11]:
Vars = ['alpha', 'beta', 'sigma']
ax = az.plot_trace(CP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [12]:
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'alpha = %.4f, beta = %.4f' % (CP_Parm.mean().values[0],
                                                                             CP_Parm.mean().values[1]))
t = np.linspace(-0.2, 1.2)
CP_fun = lambda x: CP_Parm.alpha.mean() + CP_Parm.beta.mean() *t

fig = go.Figure()

fig.add_trace(go.Scatter(x= Data['Shipping Fee'].values, y = Data['Price_Log'].values, mode='markers',
                         name='Shipping Fee - Price (Log)'))
fig.add_trace(go.Scatter(x= t, y= CP_fun(t), mode='lines', name='Fitted Model'))
fig.update_layout(title = 'Fitted Model', plot_bgcolor= 'WhiteSmoke',
                  xaxis_title= 'Shipping Fee', yaxis_title= 'Price (Log)', height = 500, width = 650)
fig['layout']['yaxis'].update(range=[-4, 8])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0,1], ticktext = ['Buyer', 'Seller']))
fig.show()
del t
alpha = 3.1047, beta = -0.4191

Note that this is similar to a Linear Regression model (Check this article for more details regarding Linear Regression model).

In [13]:
Temp = Data[['Price_Log', 'Shipping Fee']]
Temp.columns = ['Price_Log', 'Shipping_Fee']
Results = smf.ols('Price_Log ~ Shipping_Fee', Temp).fit()
Results.summary().tables[1]
Out[13]:
coef std err t P>|t| [0.025 0.975]
Intercept 3.1050 0.009 359.542 0.000 3.088 3.122
Shipping_Fee -0.4196 0.013 -32.717 0.000 -0.445 -0.394

Here, Intercept is the same as $\alpha$ and the coefficient of Shipping Fee is the same as $\beta$.

No Pooling (Unpooled Model)

We assume that there is no connection at all between the Price (Log) levels in the different categories. In other words, we model price in each category independently. That is

$$y_{i}= \alpha_{j[i]} + \beta x_{i}+\epsilon_i,$$
  • $y_i$: the logarithm of the price in product $i$,
  • $\alpha_{j[i]}$: original price in category $j[i]$ for product $i$ and category $j$,
  • $\beta$: Effect on measured $\log$ (price),
  • $\epsilon_i$: the error term.

The model can be defind with the following considersations: \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]}, \beta \sim N(\mu, \sigma^2) \end{align}

In [14]:
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;} 
                parameters {vector[M] alpha; real beta; real<lower=0,upper=100> sigma;} 
                transformed parameters {vector[N] y_hat;
                for (i in 1:N)
                    y_hat[i] = alpha[category[i]] + beta * x[i];}
                model {y ~ normal(y_hat, sigma);}"""

IT = int(1e3)
model = StanModel(model_code = model_code) 
NP_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)

Extract_Samples = NP_fit.extract(permuted=True)
# Un-pooled (No Pool) Parmameters
NP_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
NP_Parm['beta'] = Extract_Samples['beta']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_8aa7bda1b995aaf0ac1bc89a13ee18d6 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [15]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6.5))

Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]

_ = ax.scatter(x= Temp.index, y= Temp['beta'], color = 'SkyBlue', edgecolors = 'Navy')
_ = ax.hlines(Temp.mean(), -1, len(Temp.index)+1, linestyles='-',
              color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax.set_title(r'$\beta$', fontsize = 16)
_ = ax.grid(True)
_ = ax.set_xlim([-1, len(NP_Parm.index)+1])
_ = ax.set_ylim([-.36, -.24])

del Temp
In [16]:
Vars = fit_vars(NP_fit)
ax = az.plot_trace(NP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [17]:
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(NP_fit, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
INFO:numba.core.transforms:finding looplift candidates

A plot of the ordered Estimates:

In [18]:
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 8])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-1, 8])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [19]:
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = NP_Parm.copy()
beta = Temp['beta'].mean()
Temp = Temp[df_search(Temp, 'alpha')]

for alpha in Temp.mean(axis=0):
    ax.plot(t, alpha + beta*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha, beta, delta

A Visual comparisons between the pooled and unpooled estimates for a subset of categories:

In [20]:
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})

s = 4
fig, ax = plt.subplots(4, 3, figsize=(14, 14))
ax = ax.ravel()

lines = []
labels = []
for i, c in enumerate(Temp.Categories[:12]):
    y = Data.loc[Data.Category_Name==c, 'Price_Log'].values
    x = Data.loc[Data.Category_Name==c, 'Shipping Fee'].values
    NP_fun = lambda t: Temp.loc[Temp.Categories == c, 'mean'].values[0] + t*NP_Parm['beta'].mean()
    
    ax[i].scatter(x + np.random.randn(len(x))*0.01, y, alpha=0.4)
    t = np.linspace(-0.2, 1.2)
    ax[i].plot(t, NP_fun(t), 'k', lw = 2, label = 'Complete Pooling')
    ax[i].plot(t, CP_fun(t), 'r', lw = 2, label = 'No Pooling')
    ax[i].set_xticks([0,1])
    ax[i].set_title(c)
    ax[i].set_ylim([0, 8])
    ax[i].set_xlabel('Shipping Fee')
    ax[i].set_ylabel('Price (Log)')
    
    axLine, axLabel = ax[i].get_legend_handles_labels()
    lines.extend(axLine)
    labels.extend(axLabel)
    del axLine, axLabel
    
fig.legend(lines[:2], labels[:2], bbox_to_anchor=(0.2, 1), loc='lower left',
           ncol=2, title = 'Fitted Model', mode="expand", borderaxespad=0 , fontsize = 14)

plt.tight_layout()

Multilevel and Hierarchical models

There are Few approaches that we discuss here. For example,

  • Ignoring the effect of shipping, no $\beta$,
  • Defining a model that allows intercepts to vary across the category,
  • Defining a model that allows slopes to vary across the category,
  • Defining a model that allows both intercepts and slopes to vary across the category.

Partial Pooling - the Simplest

The simplest possible partial pooling model for the retail price dataset is one that simply estimates prices, $\alpha$, with no other predictors and ignoring the effect of shipping, $\beta$.

In doing so, let, $$y_i = \alpha_{j[i]} + \epsilon_i$$

where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}

In [21]:
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] y;} 
                parameters {vector[M] alpha; real mu_alpha;
                            real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;} 
                transformed parameters {vector[N] y_hat;
                for (i in 1:N)
                    y_hat[i] = alpha[category[i]];}
                model {mu_alpha ~ normal(0, 1); alpha ~ normal (mu_alpha, sigma_alpha); y ~ normal(y_hat, sigma_y);}"""
IT = int(1e3)
model = StanModel(model_code = model_code) 
PP_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)

Extract_Samples = PP_fit.extract(permuted=True)
# Partial Pooling Parmameters
PP_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0727cfb6bf80e3019e25eccebf3d9ffa NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [22]:
Vars = ['sigma_y', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(PP_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [23]:
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(PP_fit, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [24]:
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 5])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp

Standard Error $$SE = \frac{\sigma} {\sqrt{n}}$$

  • $\text{SE}$ = standard error of the sample
  • $\sigma$ = sample standard deviation
  • $n$ = number of samples
In [25]:
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]

Group = pd.DataFrame({'n': Data.groupby('Category_Name')['Train_ID'].count(),
                         'mean': Data.groupby('Category_Name')['Price_Log'].mean(),
                         'sd': Data.groupby('Category_Name')['Price_Log'].std()})
Group['se'] = Group['sd']/np.sqrt(Group['n'])

# deviations
jitter = np.random.normal(scale=0.1, size= Temp.shape[1])

fig, ax = plt.subplots(1, 2, figsize=(16,8))
_ = ax[0].scatter(Group['n'] + jitter, Group['mean'], color = 'SkyBlue', edgecolors = 'Navy')
for j, row in zip(jitter, Group.iterrows()):
    n, mean, se = n, mean, se = row[1]['n'], row[1]['mean'], row[1]['se']
    _ = ax[0].plot([n + j, n + j], [mean - se, mean + se], 'k-', alpha = 0.5)
_ = ax[0].set_xscale('log')
_ = ax[0].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[0].set_ylim([0,6])
_ = ax[0].set_xlim([0,1e3])
_ = ax[0].set_title('No Pool Model Estimates Mean')

_ = ax[1].scatter(Group['n'] + jitter, Temp.mean(axis=0), color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[1].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[1].set_xscale('log')
_ = ax[1].set_xlim([0,1e3])
_ = ax[1].set_ylim([0,6])
_ = ax[1].set_title('Partial Pool Model Estimates Mean')
for j, n , mean , std in zip(jitter, Group['n'].values, Temp.mean(axis=0), Temp.std(axis=0)):
    _ = ax[1].plot([n+j, n+j], [mean- std, mean + std], 'k-', alpha = 0.5)
    
del j, n, mean, std, Temp, Group

The unpooled estimates are more imprecise than the partial pool estimates.

Partial Pooling - Varying Intercept

This model allows intercepts to vary across the category, according to a random effect.

$$y_i = \alpha_{j[i]} + \beta x_{i} + \epsilon_i$$

where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}

In [26]:
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;} 
                parameters {vector[M] alpha; real beta; real mu_alpha;
                real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;} 
                transformed parameters {
                vector[N] y_hat;
                for (i in 1:N)
                    y_hat[i] = alpha[category[i]] + x[i]*beta;}
                model {sigma_alpha ~ uniform(0, 100); alpha ~ normal (mu_alpha, sigma_alpha);
                beta ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}"""

IT = int(1e3)
model = StanModel(model_code = model_code) 
VI_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = VI_fit.extract(permuted=True)
# Varying Intercept Parmameters
VI_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
VI_Parm['beta'] = VI_fit['beta']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_b99073f13080d6f65bb96e4568e5d8f1 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [27]:
Vars = fit_vars(VI_fit)
ax = az.plot_trace(VI_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [28]:
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(VI_fit, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [29]:
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [30]:
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VI_Parm.copy()

for alpha in Temp[df_search(Temp, 'alpha')].mean(axis=0):
    ax.plot(t, alpha + Temp['beta'].mean()*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha

Prediction

Lets's pick a category. For example,

In [31]:
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [32]:
model_code = """data {int<lower=0> N; int<lower=0> M;
                int<lower=1,upper=M> category[N];
                int<lower=0,upper=M> pred;
                vector[N] x; vector[N] y;
                real x_pred;} 
                parameters {vector[M] alpha; real beta; real mu_alpha;
                real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;} 
                transformed parameters {vector[N] y_hat; real pred_mu;
                for (i in 1:N)
                    y_hat[i] = alpha[category[i]] + x[i]*beta;
                
                pred_mu = alpha[pred + 1] + x_pred * beta;}
                model {sigma_alpha ~ uniform(0, 100); alpha ~ normal (mu_alpha, sigma_alpha);
                beta ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}
                generated quantities {real y_pred;
                    y_pred = normal_rng(pred_mu, sigma_y);}
                """

IT = int(1e3)
model = StanModel(model_code = model_code) 
VI_pred = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values,
                                'pred': Selected_Category,
                                'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                },
                        iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_2d2f1b23c36143764c9da57b5a5630f7 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [33]:
_ = az.plot_trace(VI_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(VI_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [34]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

Temp0 = VI_pred.extract(permuted=True)['y_pred']

_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)


Temp0 = np.exp(Temp0)

_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)

del Temp0
In [35]:
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376

Partial Pooling - Varying Slope model

An alternative would be

$$y_i = \alpha + \beta_{j[i]} x_{i} + \epsilon_i$$

where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2)\\ \alpha \sim N(0, \sigma^2)\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2) \end{cases} \end{align}

This model highlights the relationship between measured the logarithm of the price, the prevailing price level, and the effect of who pays the shipping fee at which the measurement was made.

In [36]:
model_code = """data {int<lower=0> N; int<lower=0> M; int<lower=1,upper=M> category[N]; vector[N] x; vector[N] y;}
                parameters {real alpha; vector[M] beta; real mu_beta; real<lower=0,upper=100> sigma_beta;
                real<lower=0,upper=100> sigma_y;} 
                transformed parameters {
                vector[N] y_hat;
                for (i in 1:N)
                    y_hat[i] <- alpha + beta[category[i]] * x[i];}
                model {sigma_beta ~ uniform(0, 100); beta ~ normal (mu_beta, sigma_beta);
                alpha ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}"""
IT = int(1e3)
model = StanModel(model_code = model_code) 
VS_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = VS_fit.extract(permuted=True)
# Varying Slope Parmameters
VS_Parm = pd.DataFrame(Extract_Samples['beta'], columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])
VS_Parm['alpha'] = VS_fit['alpha']
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_65e6829fce2dc08bd6b7cd5b2e245975 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [37]:
Vars = fit_vars(VS_fit)
Vars = sorted(Vars)
ax = az.plot_trace(VS_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [38]:
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(VS_fit, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [39]:
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-2, 2])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-3, 3])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [40]:
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VS_Parm.copy()

for beta in Temp[df_search(Temp, 'beta')].mean(axis=0):
    ax.plot(t, Temp['alpha'].mean() + beta*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')

Prediction

In [41]:
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [42]:
model_code = """data {int<lower=0> N; int<lower=0> M;
                int<lower=1,upper=M> category[N];
                int<lower=0,upper=M> pred;
                vector[N] x; vector[N] y;
                real x_pred;} 
                parameters {real alpha; vector[M] beta; real mu_alpha;
                real<lower=0,upper=100> sigma_alpha; real<lower=0,upper=100> sigma_y;} 
                transformed parameters {vector[N] y_hat; real pred_mu;
                for (i in 1:N)
                    y_hat[i] = alpha + x[i]*beta[category[i]];
                
                pred_mu = alpha + x_pred * beta[pred + 1];}
                model {sigma_alpha ~ uniform(0, 100); alpha ~ normal (mu_alpha, sigma_alpha);
                beta ~ normal (0, 1); sigma_y ~ uniform(0, 100); y ~ normal(y_hat, sigma_y);}
                generated quantities {real y_pred;
                    y_pred = normal_rng(pred_mu, sigma_y);}
                """

IT = int(1e3)
model = StanModel(model_code = model_code) 
VS_pred = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values,
                                'pred': Selected_Category,
                                'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                },
                        iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c440dcc8f99272d7ac526fced2cc6489 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
WARNING:pystan:44 of 1000 iterations ended with a divergence (4.4 %).
WARNING:pystan:Try running with adapt_delta larger than 0.8 to remove the divergences.
In [43]:
_ = az.plot_trace(VS_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(VS_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [44]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

Temp0 = VS_pred['y_pred']

_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)


Temp0 = np.exp(Temp0)

_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)

del Temp0
In [45]:
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376

Partial Pooling - Varying Slope and Intercept

The most general model allows both the intercept and slope to vary by category:

$$y_i = \alpha_{j[i]} + \beta_{j[i]} x_{i} + \epsilon_i$$

where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2),\\ \alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2),\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2). \end{cases} \end{align}

In [46]:
model_code = """data {int<lower=0> N; int<lower=0> M; vector[N] x; vector[N] y; int category[N];}
                parameters {vector[M] alpha; vector[M] beta;
                            real mu_alpha; real mu_beta;
                            real<lower=0> sigma; real<lower=0> sigma_alpha; real<lower=0> sigma_beta;}
                model {mu_alpha ~ normal(0, 100); mu_beta ~ normal(0, 100);
                        alpha ~ normal(mu_alpha, sigma_alpha); beta ~ normal(mu_beta, sigma_beta);
                        y ~ normal(alpha[category] + beta[category].*x, sigma);}"""
IT = int(1e3)
model = StanModel(model_code = model_code) 
VSI_fit = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values},
                        iter= IT, chains=2)
Extract_Samples = VSI_fit.extract(permuted=True)
# Varying Slope Parmameters
VSI_Parm = pd.DataFrame(Extract_Samples['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
Temp = pd.DataFrame(Extract_Samples['beta'], columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])
VSI_Parm = pd.concat([VSI_Parm,Temp], axis =1)
del Extract_Samples
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_07cfdb859f961538dc97402c518711de NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [47]:
Vars = fit_vars(VSI_fit)
ax = az.plot_trace(VSI_fit, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [48]:
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(VSI_fit, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [49]:
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [50]:
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 1])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in the Effect of the Paid Shipping Fees by Category', fontsize = 18)

# Bottom
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-2, 2])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [51]:
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VSI_Parm.copy()

for a, b in zip(Temp[df_search(Temp, 'alpha')].mean(axis=0), Temp[df_search(Temp, 'beta')].mean(axis=0)):
    ax.plot(t, a + b*t, color='DarkBlue', marker='o', linestyle='--', linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')

Prediction

In [52]:
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [53]:
model_code = """data {int<lower=0> N; int<lower=0> M;
                int<lower=1,upper=M> category[N];
                int<lower=0,upper=M> pred;
                vector[N] x; vector[N] y;
                real x_pred;} 
                parameters {vector[M] alpha; vector[M] beta;
                real mu_alpha; real mu_beta;
                real<lower=0> sigma; real<lower=0> sigma_alpha; real<lower=0> sigma_beta;
                real y_pred;} 
                model {mu_alpha ~ normal(0, 100); mu_beta ~ normal(0, 100);
                    alpha ~ normal(mu_alpha, sigma_alpha); beta ~ normal(mu_beta, sigma_beta);
                    y ~ normal(alpha[category] + beta[category].*x, sigma);
                    y_pred ~ normal(alpha[pred + 1] + beta[pred + 1].*x_pred, sigma);}
                """

IT = int(1e3)
model = StanModel(model_code = model_code) 
VSI_pred = model.sampling(data = {'N': Data.shape[0],
                                'M': len(Data['Category'].unique()),
                                'category': Data['Category'].values + 1,
                                'x': Data['Shipping Fee'].values,
                                'y': Data['Price_Log'].values,
                                'pred': Selected_Category,
                                'x_pred': Data.loc[Data.Category == Selected_Category,'Shipping Fee'].values.mean()
                                },
                        iter= IT, chains=2)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0e0980e970985f4a6f6b12bc59a9f027 NOW.
WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
In [54]:
_ = az.plot_trace(VSI_pred, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(VSI_pred, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [55]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

Temp0 = VSI_pred['y_pred']

_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)


Temp0 = np.exp(Temp0)

_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
                                                        'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)

del Temp0
In [56]:
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376